In [1]:
# --- Task 1: Understand the Dataset ---

# First, we tell Python we need our special data assistant, pandas.
import pandas as pd

# Now, we ask pandas to open and read our CSV case file.
# We'll store all the data in a variable called 'df' (short for DataFrame).
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset  (3).csv")
print("File loaded successfully! Let's take a look...\n")


# 1. Display the first 10 rows [cite: 54]
# This is like peeking at the first 10 lines of our case file.
print("--- First 10 Rows of the Dataset ---")
print(df.head(10))
print("\n" + "="*50 + "\n")


# 2. Identify the data types of each column [cite: 55]
# This tells us if a column has numbers, text, etc.
print("--- Data Types of Each Column ---")
df.info()
print("\n" + "="*50 + "\n")


# 3. Check for missing values [cite: 56]
# This is our check for any blank spots in the file.
print("--- Missing Values in Each Column ---")
print(df.isnull().sum())
File loaded successfully! Let's take a look...

--- First 10 Rows of the Dataset ---
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   
5  9305-CDSKC  Female              0      No         No       8          Yes   
6  1452-KIOVK    Male              0      No        Yes      22          Yes   
7  6713-OKOMC  Female              0      No         No      10           No   
8  7892-POOKP  Female              0     Yes         No      28          Yes   
9  6388-TABGU    Male              0      No        Yes      62          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   
5               Yes     Fiber optic             No  ...              Yes   
6               Yes     Fiber optic             No  ...               No   
7  No phone service             DSL            Yes  ...               No   
8               Yes     Fiber optic             No  ...              Yes   
9                No             DSL            Yes  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract PaperlessBilling  \
0          No          No              No  Month-to-month              Yes   
1          No          No              No        One year               No   
2          No          No              No  Month-to-month              Yes   
3         Yes          No              No        One year               No   
4          No          No              No  Month-to-month              Yes   
5          No         Yes             Yes  Month-to-month              Yes   
6          No         Yes              No  Month-to-month              Yes   
7          No          No              No  Month-to-month               No   
8         Yes         Yes             Yes  Month-to-month              Yes   
9          No          No              No        One year               No   

               PaymentMethod MonthlyCharges  TotalCharges Churn  
0           Electronic check          29.85         29.85    No  
1               Mailed check          56.95        1889.5    No  
2               Mailed check          53.85        108.15   Yes  
3  Bank transfer (automatic)          42.30       1840.75    No  
4           Electronic check          70.70        151.65   Yes  
5           Electronic check          99.65         820.5   Yes  
6    Credit card (automatic)          89.10        1949.4    No  
7               Mailed check          29.75         301.9    No  
8           Electronic check         104.80       3046.05   Yes  
9  Bank transfer (automatic)          56.15       3487.95    No  

[10 rows x 21 columns]

==================================================

--- Data Types of Each Column ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB

==================================================

--- Missing Values in Each Column ---
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
In [6]:
# --- Task 2: Data Cleaning ---

# Let's bring in our data assistant, pandas.
import pandas as pd

# First, we load our case file again to start fresh.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset  (3).csv")
print("File loaded. Beginning cleaning process...\n")

# --- Step 1: Handle missing/incorrect values in 'TotalCharges' ---

# We tell pandas to try and turn the 'TotalCharges' column into numbers.
# If it finds something that isn't a number (like a blank space), it will mark it as a blank spot (NaN).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Now, let's see if any blank spots appeared after that change.
print("--- Checking for blank spots after converting 'TotalCharges' to numbers ---")
print(df.isnull().sum())
print("\nAha! We found 11 blank spots in 'TotalCharges'!\n")

# Since these are blank, we'll fill them with the number 0.
df['TotalCharges'].fillna(0, inplace=True)
print("--- Blank spots have been filled with 0. Let's check again ---")
print(df.isnull().sum())
print("\nSuccess! No more blank spots.")
print("\n" + "="*50 + "\n")


# --- Step 2: Remove duplicate records ---

# Let's count how many duplicate rows we have.
print(f"Number of duplicate rows found: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True) # This command removes them.
print("Duplicate rows have been removed.\n")
print("="*50 + "\n")


# --- Step 3: Standardize column names ---

# This line takes all column names and makes them lowercase.
df.columns = [col.lower() for col in df.columns]

print("--- All column names are now standardized to lowercase ---")
print(df.columns.tolist())
File loaded. Beginning cleaning process...

--- Checking for blank spots after converting 'TotalCharges' to numbers ---
customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

Aha! We found 11 blank spots in 'TotalCharges'!

--- Blank spots have been filled with 0. Let's check again ---
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Success! No more blank spots.

==================================================

Number of duplicate rows found: 0
Duplicate rows have been removed.

==================================================

--- All column names are now standardized to lowercase ---
['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'tenure', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod', 'monthlycharges', 'totalcharges', 'churn']
In [3]:
# --- Task 3: Exploratory Data Analysis (EDA) ---

# Part 0: SETUP - We'll get everything ready first.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

# Set a nice style for our charts
sns.set_style("whitegrid")

# --- Load and Clean the Data (Prerequisite Steps) ---
# We do this again to ensure our data is fresh and correct.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset  (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Task 3 analysis...\n")


# --- Step 1: Generate Summary Statistics ---
# This is like a quick report card for our main numerical columns.
print("--- Summary Statistics for Numerical Data ---")
print(df[['tenure', 'monthlycharges', 'totalcharges']].describe())
print("\n" + "="*50 + "\n")


# --- Step 2: Analyze Churn Rate ---
# Here we find out what percentage of customers left.
print("--- Churn Proportions ---")
churn_percentage = df['churn'].value_counts(normalize=True) * 100
print(churn_percentage)
print("\n")

# Now, let's make a simple bar chart to see the difference.
plt.figure(figsize=(8, 6))
sns.countplot(x='churn', data=df)
plt.title('Distribution of Customer Churn', fontsize=16)
plt.xlabel('Did the Customer Leave? (Churn)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.show()


# --- Step 3: Create Visualizations for Numerical Columns ---
# We'll make a histogram and a box plot for each of our number columns.
numerical_features = ['tenure', 'monthlycharges', 'totalcharges']
for feature in numerical_features:
    plt.figure(figsize=(14, 5))
    
    # A histogram shows us the 'shape' of the data.
    plt.subplot(1, 2, 1)
    sns.histplot(df[feature], kde=True, bins=30)
    plt.title(f'Histogram of {feature.capitalize()}', fontsize=14)
    
    # A box plot shows us the 'spread' of the data in a different way.
    plt.subplot(1, 2, 2)
    sns.boxplot(x=df[feature])
    plt.title(f'Box Plot of {feature.capitalize()}', fontsize=14)
    
    plt.tight_layout()
    plt.show()
Data loaded and cleaned. Starting Task 3 analysis...

--- Summary Statistics for Numerical Data ---
            tenure  monthlycharges  totalcharges
count  7043.000000     7043.000000   7043.000000
mean     32.371149       64.761692   2279.734304
std      24.559481       30.090047   2266.794470
min       0.000000       18.250000      0.000000
25%       9.000000       35.500000    398.550000
50%      29.000000       70.350000   1394.550000
75%      55.000000       89.850000   3786.600000
max      72.000000      118.750000   8684.800000

==================================================

--- Churn Proportions ---
churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64


No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [4]:
# --- Task 4: Customer Segmentation Visualization ---

# Part 0: SETUP - Get our tools and data ready
import pandas as pd
import plotly.express as px  # We're using Plotly Express for our cool charts!
import warnings

# Ignore warnings for a clean output
warnings.filterwarnings('ignore')

# --- Load and Clean the Data (Prerequisite Steps) ---
# This makes sure our data is fresh and correct before we start Task 4.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset  (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Task 4 visualizations...\n")


# --- Part 1: Create Customer Groups Based on Tenure ---
# We'll slice our customers into three groups based on their 'tenure'.
print("--- Creating Tenure Groups ---")

# These are the "cut-off points" for our groups in months.
tenure_bins = [0, 12, 36, 73] 

# These are the labels we'll give to each group.
tenure_labels = ['0-12 Months', '13-36 Months', '37+ Months']

# This command creates the new 'tenure_group' column and sorts each customer.
df['tenure_group'] = pd.cut(df['tenure'], bins=tenure_bins, labels=tenure_labels, include_lowest=True)
print("Successfully sorted customers into tenure groups.\n")


# --- Part 2: Visualize Customer Distribution (Donut Chart) ---
# Let's see how many customers are in each group.

# First, we count the customers in each group.
tenure_distribution = df['tenure_group'].value_counts().reset_index()
tenure_distribution.columns = ['tenure_group', 'customer_count']

# Now, we create the interactive donut chart!
fig_donut = px.pie(tenure_distribution,
                   names='tenure_group',
                   values='customer_count',
                   title='Customer Distribution by Tenure Group',
                   hole=0.4, # This makes the hole in the middle!
                   color_discrete_sequence=px.colors.qualitative.Pastel)

# This adds the nice labels inside the chart slices.
fig_donut.update_traces(textposition='inside', textinfo='percent+label')

print("Displaying Donut Chart...")
fig_donut.show()


# --- Part 3: Compare Average Monthly Charges (Bar Chart) ---
# Let's find out if long-time customers pay more per month.

# First, we calculate the average monthly charge for each group.
avg_monthly_charges = df.groupby('tenure_group')['monthlycharges'].mean().round(2).reset_index()

# Now, we build our bar chart.
fig_bar = px.bar(avg_monthly_charges,
                 x='tenure_group',
                 y='monthlycharges',
                 title='Average Monthly Charges by Tenure Group',
                 text='monthlycharges',  # This adds the number labels on the bars
                 color='tenure_group',
                 labels={'tenure_group': 'Tenure Group', 'monthlycharges': 'Average Monthly Charge ($)'})

# This makes the labels on the bars look like money.
fig_bar.update_traces(texttemplate='$%{text}', textposition='outside')
fig_bar.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

print("\nDisplaying Bar Chart...")
fig_bar.show()
Data loaded and cleaned. Starting Task 4 visualizations...

--- Creating Tenure Groups ---
Successfully sorted customers into tenure groups.

Displaying Donut Chart...
Displaying Bar Chart...
In [5]:
# --- Task 5: Advanced Analysis ---

# Part 0: SETUP - Get our tools and data ready
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore warnings for a clean output
warnings.filterwarnings('ignore')

# Set a nice style for our charts
sns.set_style("whitegrid")

# --- Load and Clean the Data (Prerequisite Steps) ---
# We do this one last time to ensure our data is fresh and correct.
df = pd.read_csv(r"C:\Users\himanshu\Downloads\Telco_Customer_Churn_Dataset  (3).csv")
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.fillna({'TotalCharges': 0}, inplace=True)
df.columns = [col.lower() for col in df.columns]
print("Data loaded and cleaned. Starting Final Analysis...\n")


# --- Part 1: Prepare for Churn Calculation ---
# To make calculating the churn rate easier, we'll turn the 'churn' column (which has "Yes" and "No")
# into a new column with numbers (1 for "Yes" and 0 for "No").
df['churn_numeric'] = df['churn'].apply(lambda x: 1 if x == 'Yes' else 0)
print("--- Prepared data for churn rate analysis ---")
print("Customers who churned are now marked as 1, and those who stayed as 0.\n")


# --- Part 2: Analyze Churn by Demographics ---
# Let's see if churn is different for gender or senior citizens.
print("--- Investigating Demographics... ---")
demographics = ['gender', 'seniorcitizen']
for col in demographics:
    plt.figure(figsize=(8, 6))
    # A bar plot is perfect for comparing the average churn rate between groups.
    sns.barplot(x=col, y='churn_numeric', data=df)
    plt.title(f'Churn Rate by {col.capitalize()}', fontsize=16)
    plt.ylabel('Churn Rate', fontsize=12)
    plt.xlabel(col.capitalize(), fontsize=12)
    plt.show()

    
# --- Part 3: Analyze Churn by Contract and Payment Method ---
# This is where we look for big clues related to the services.
print("\n--- Investigating Contracts and Payment Methods... ---")
contract_payment = ['contract', 'paymentmethod']
for col in contract_payment:
    plt.figure(figsize=(10, 6))
    # A count plot with 'hue' lets us see the 'Yes' vs 'No' churn bars for each category.
    sns.countplot(x=col, hue='churn', data=df)
    plt.title(f'Churn Distribution by {col.capitalize()}', fontsize=16)
    plt.xlabel(col.capitalize(), fontsize=12)
    plt.ylabel('Number of Customers', fontsize=12)
    plt.xticks(rotation=25, ha='right') # Rotate labels to prevent overlap
    plt.tight_layout()
    plt.show()

    
# --- Part 4: Visualize Churn Trends Over Customer Lifecycle ---
# Does churn change the longer someone is a customer? Let's find out!
print("\n--- Investigating Churn Rate over Time (Tenure)... ---")
plt.figure(figsize=(12, 6))
# We can create a line plot to see the trend.
sns.lineplot(data=df, x='tenure', y='churn_numeric')
plt.title('Churn Rate vs. Customer Tenure', fontsize=16)
plt.xlabel('Tenure (Months)', fontsize=12)
plt.ylabel('Churn Rate', fontsize=12)
plt.show()
Data loaded and cleaned. Starting Final Analysis...

--- Prepared data for churn rate analysis ---
Customers who churned are now marked as 1, and those who stayed as 0.

--- Investigating Demographics... ---
No description has been provided for this image
No description has been provided for this image
--- Investigating Contracts and Payment Methods... ---
No description has been provided for this image
No description has been provided for this image
--- Investigating Churn Rate over Time (Tenure)... ---
No description has been provided for this image